#!/bin/bash

#######################################
# Build the lexicon indexes used for segmentation and, if the user asks
# for it, train the CRF segmentation model.
# Globals (read): number_trailing, number_trailing_url, user_combine, break,
#                 corpus_file, bin, build_dir, index_dir, tmpl_dir, thread_num
# Input:   one line from stdin; "1" starts CRF training, anything else
#          skips it.
# Outputs: progress on stdout, errors on stderr; files under build_dir and
#          index_dir.
# Exits:   terminates the script with status 1 as soon as any step fails.
#######################################
function train_seg() {
	local dict_name choice

	echo "Segmentation Training:"
	if [[ ! -f "$number_trailing" ]]; then
		# Download into a side file and rename on success, so a failed
		# transfer never leaves a stub that passes the -f check next run.
		if ! wget -O "${number_trailing}.part" "$number_trailing_url"; then
			rm -f -- "${number_trailing}.part"
			exit 1
		fi
		mv -- "${number_trailing}.part" "$number_trailing" || exit 1
	fi

	# The optional dictionaries may be absent; start them out empty.
	[[ -f "$user_combine" ]] || touch -- "$user_combine" || exit 1
	[[ -f "$break" ]] || touch -- "$break" || exit 1

	# Build Lexicon: every dictionary line is prefixed with "1 ".
	sed -e 's/^/1 /' < "$user_combine" > "${build_dir}/user_combine.txt" || exit 1
	sed -e 's/^/1 /' < "$number_trailing" > "${build_dir}/number_trailing.txt" || exit 1
	sed -e 's/^/1 /' < "$break" > "${build_dir}/break.txt" || exit 1

	"$bin/pdc_normalize" -v "${corpus_file}" > "${build_dir}/normalized.txt" || exit 1
	"$bin/ngm_tool" -n1 -v "${build_dir}/normalized.txt" > "${build_dir}/unigram.txt" || exit 1
	for dict_name in unigram user_combine number_trailing break; do
		"$bin/lexicon" -v -b -i "${index_dir}/${dict_name}.idx" \
			-s "${build_dir}/${dict_name}.txt" || exit 1
	done

	# Train CRF
	echo
	echo "1). Training CRF Segment Model. (may take dozens of hours)"
	echo "*). Do nothing."
	read -r choice
	case "$choice" in
		1)
			# Fail with a non-zero status when the trainer is unavailable.
			if ! command -v crf_learn > /dev/null 2>&1; then
				echo "crf_learn not found" >&2
				exit 1
			fi
			"$bin/crf2_tool" -i "${build_dir}/normalized.txt" \
				-o "${build_dir}/pd_crf_seg_training.txt" || exit 1
			# Train CRF Seg
			crf_learn -p "${thread_num}" -c 4.0 -f 2 \
				"${tmpl_dir}/crf_seg.tmpl" \
				"${build_dir}/pd_crf_seg_training.txt" \
				"${index_dir}/crf_seg.model" || exit 1
			;;
		*) ;;
	esac

	echo "Done."
}

#######################################
# Generate the POS training file from the corpus and run crf_learn on it.
# Globals (read): bin, corpus_file, build_dir, index_dir, tmpl_dir, thread_num
# Outputs: ${build_dir}/pd_pos_training.txt; crf_learn is handed
#          ${index_dir}/crf_pos.model as its final (model file) argument.
# Exits:   terminates the script with status 1 if either step fails.
#######################################
function train_pos() {
	local training_file="${build_dir}/pd_pos_training.txt"

	echo "Generating Training file for POS"
	"$bin/pdc_pos_tool" "${corpus_file}" > "$training_file" || exit 1

	echo "Begin CRF Training"
	# All expansions are quoted so that paths containing whitespace stay
	# single arguments and an empty thread_num cannot shift the options.
	crf_learn -p "${thread_num}" -c 4.0 -f 2 \
		"${tmpl_dir}/crf_pos.tmpl" \
		"$training_file" \
		"${index_dir}/crf_pos.model" || exit 1
}

#######################################
# Generate the NR entity-recognition training file and run crf_learn on it.
# Globals (read): bin, corpus_file, build_dir, index_dir, tmpl_dir, thread_num
# Outputs: ${build_dir}/pd_nr.txt (pdc_normalize_4nr output) and
#          ${build_dir}/pd_nr_training.txt (ner_nr_tool output); crf_learn is
#          handed ${index_dir}/crf_ner_nr.model as its final argument.
# Exits:   terminates the script with status 1 if any step fails.
#######################################
function train_nr() {
	local normalized="${build_dir}/pd_nr.txt"
	local training_file="${build_dir}/pd_nr_training.txt"

	echo "Generating Training file for NR entity recognition"
	"$bin/pdc_normalize_4nr" -v "${corpus_file}" > "$normalized" || exit 1
	"$bin/ner_nr_tool" "$normalized" > "$training_file" || exit 1

	echo "Begin CRF Training"
	# Quoted so whitespace in any configured path cannot split arguments.
	crf_learn -p "${thread_num}" -c 4.0 -f 2 \
		"${tmpl_dir}/crf_ner_nr.tmpl" \
		"$training_file" \
		"${index_dir}/crf_ner_nr.model" || exit 1
}

#######################################
# Index the NS suffix dictionary (downloading it first when it is missing),
# generate the NS entity-recognition training file and run crf_learn on it.
# Globals (read): ns_suffix, ns_suffix_url, bin, corpus_file, build_dir,
#                 index_dir, tmpl_dir, thread_num
# Outputs: ${index_dir}/ns_suffix.idx is passed to lexicon via -i,
#          ${build_dir}/pd_ns_training.txt is written; crf_learn is handed
#          ${index_dir}/crf_ner_ns.model as its final argument.
# Exits:   terminates the script with status 1 if any step fails.
#######################################
function train_ns() {
	local training_file="${build_dir}/pd_ns_training.txt"

	if [[ ! -f "${ns_suffix}" ]]; then
		# Download into a side file and rename on success, so a failed
		# transfer never leaves a stub that passes the -f check next run.
		if ! wget -O "${ns_suffix}.part" "${ns_suffix_url}"; then
			rm -f -- "${ns_suffix}.part"
			exit 1
		fi
		mv -- "${ns_suffix}.part" "${ns_suffix}" || exit 1
	fi

	"$bin/lexicon" -v -b -i "${index_dir}/ns_suffix.idx" -s "${ns_suffix}" || exit 1

	echo "Generating Training file for NS entity recognition"
	"$bin/ner_tool" -s "${corpus_file}" -t ns > "$training_file" || exit 1

	echo "Begin CRF Training"
	crf_learn -p "${thread_num}" -c 4.0 -f 2 \
		"${tmpl_dir}/crf_ner_ns.tmpl" \
		"$training_file" \
		"${index_dir}/crf_ner_ns.model" || exit 1
}

#######################################
# Generate the NT entity-recognition training file and run crf_learn on it.
# Globals (read): bin, corpus_file, build_dir, index_dir, tmpl_dir, thread_num
# Outputs: ${build_dir}/pd_nt_training.txt; crf_learn is handed
#          ${index_dir}/crf_ner_nt.model as its final argument.
# Exits:   terminates the script with status 1 if either step fails.
#######################################
function train_nt() {
	local training_file="${build_dir}/pd_nt_training.txt"

	echo "Generating Training file for NT entity recognition"
	"$bin/ner_tool" -s "${corpus_file}" -t nt > "$training_file" || exit 1

	echo "Begin CRF Training"
	# Quoted so whitespace in any configured path cannot split arguments.
	crf_learn -p "${thread_num}" -c 4.0 -f 2 \
		"${tmpl_dir}/crf_ner_nt.tmpl" \
		"$training_file" \
		"${index_dir}/crf_ner_nt.model" || exit 1
}

#######################################
# Build the keyword filter index, derive a training configuration whose
# "root" setting points at $top, and run the two keyword training tools.
# Globals (read): corpus_dir, USAGE, ke_filter_dict, bin, build_dir,
#                 index_dir, top
# Outputs: ${build_dir}/ke_filter.txt, ${build_dir}/training_keyword.conf;
#          ${build_dir}/kea_corpus_plain_text.txt is passed to kea_word_train
#          via -o and to kea_wordaff_train via -f.
# Exits:   terminates the script with status 1 when corpus_dir is empty or
#          any step fails.
#######################################
function train_keyword() {
	local conf="${build_dir}/training_keyword.conf"
	local plain_text="${build_dir}/kea_corpus_plain_text.txt"
	local top_enc

	if [[ -z "${corpus_dir}" ]]; then
		echo "You must specify corpus dir" >&2
		echo "$USAGE" >&2
		exit 1
	fi

	# The filter dictionary is optional; start it out empty when absent.
	[[ -f "$ke_filter_dict" ]] || touch -- "$ke_filter_dict" || exit 1
	sed -e 's/^/1 /' < "$ke_filter_dict" > "${build_dir}/ke_filter.txt" || exit 1
	"$bin/lexicon" -v -b -i "${index_dir}/filter_word.idx" -s "${build_dir}/ke_filter.txt" || exit 1

	cp -- "$top/etc/keyword.conf" "$conf" || exit 1
	# Escape everything that is special inside a sed replacement: the "/"
	# delimiter, "&" (the matched text) and the backslash itself.
	top_enc=$(printf '%s' "$top" | sed -e 's|[\\/&]|\\&|g') || exit 1
	sed -i "s/^\s*root\s*=.*$/root = ${top_enc}/g" "$conf" || exit 1

	"$bin/kea_word_train" -c "$conf" -s "${corpus_dir}" -o "$plain_text" || exit 1
	"$bin/kea_wordaff_train" -c "$conf" -f "$plain_text" || exit 1
}

# Usage text printed for -h and for invalid invocations.
USAGE="auto_build -t train_type{seg,nr,ns,nt,pos,keyword} [-f corpus_file|-d corpus_dir] [-p thread_num]"

# Parse the command line into corpus_dir, corpus_file, train_type and
# thread_num. An option getopts rejects aborts the run with the usage text.
while getopts "d:f:t:p:h" flag; do
	case "$flag" in
		d) corpus_dir=$OPTARG ;;
		f) corpus_file=$OPTARG ;;
		t) train_type=$OPTARG ;;
		p) thread_num=$OPTARG ;;
		h)
			echo "$USAGE"
			exit 0
			;;
		\?)
			# Escaped so that only a literal "?" (getopts' error marker)
			# matches, rather than any single character.
			echo "Invalid option" >&2
			echo "$USAGE" >&2
			exit 1
			;;
	esac
done

# Reject a missing or unknown training type before any work is done, so a
# typo in -t cannot end in a silent, successful no-op.
case "${train_type:-}" in
	seg | nr | ns | nt | pos | keyword) ;;
	*)
		echo "$USAGE" >&2
		exit 1
		;;
esac

# Default to a single training thread.
thread_num=${thread_num:-1}

# Project root is the parent of the directory holding this script.
top="$(dirname "$(readlink -f "$0")")/.."
# build_settings presumably defines data_dir, build_dir, index_dir, corpus,
# corpus_url and the dictionary paths used by the train_* functions; they
# are read in this file but never assigned here.
source "$top/etc/build_settings" || exit 1

mkdir -p "$data_dir" "$build_dir" "$index_dir" || exit 1

# Without -f, fall back to the configured corpus and fetch it when absent.
if [[ -z "${corpus_file:-}" ]]; then
	corpus_file=$corpus
	if [[ ! -f "$corpus" ]]; then
		# Decompress into a side file and check BOTH pipeline stages, so a
		# failed download never leaves a truncated corpus behind.
		wget -O - "$corpus_url" | gzip -cd > "${corpus}.part"
		download_status=("${PIPESTATUS[@]}")
		if ((download_status[0] != 0 || download_status[1] != 0)); then
			rm -f -- "${corpus}.part"
			exit 1
		fi
		mv -- "${corpus}.part" "$corpus" || exit 1
	fi
fi

# train_type was validated above, so it always names one of the train_*
# functions defined in this file.
"train_${train_type}"

